{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0cb1deb7",
   "metadata": {},
   "source": [
    "### Identification of DUP transcript fusion candidates"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "170020c8",
   "metadata": {},
   "source": [
    "Criteria: \n",
    "\n",
    "* Duplication overlaps 3' end of misexpressed gene \n",
    "* Duplication partially overlaps 5' end of an expressed gene \n",
    "* Misexpressed and readthrough gene on the same strand \n",
    "* Misexpressed gene upstream of transcribed gene "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1293ce64",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from pybedtools import BedTool\n",
    "from io import StringIO\n",
    "import pysam\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c0fac69f",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "# inputs\n",
    "misexp_vrnt_feat_path = wkdir_path.joinpath(\"6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv\")\n",
    "gencode_bed_path = wkdir_path.joinpath(\"3_misexp_genes/bed_files/all_genes.bed\")\n",
    "tpm_mtx_path = wkdir_path.joinpath(\"1_rna_seq_qc/tpm_mtx/tpm_4568samples_59144genes_smpl_qc.csv\")\n",
    "# constants \n",
    "tpm_cutoff = 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e6be28a8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of genes with median TPM > 0.5: 17418\n"
     ]
    }
   ],
   "source": [
    "# only include overlapping genes that are transcribed (median TPM > 0.5)\n",
    "# load gene expression matrix \n",
    "tpm_mtx_df = pd.read_csv(tpm_mtx_path)\n",
    "# add median expression of gene across INTERVAL \n",
    "median_tpm_df = pd.DataFrame(tpm_mtx_df.set_index(\"gene_id\").median(axis=1))\n",
    "median_tpm_df = median_tpm_df.rename(columns={0:\"median_tpm\"}).reset_index()\n",
    "genes_median_tpm05 = median_tpm_df[median_tpm_df.median_tpm > tpm_cutoff].gene_id.unique()\n",
    "print(f\"Number of genes with median TPM > {tpm_cutoff}: {len(genes_median_tpm05)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3fc51698",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of misexpression-associated duplications: 16\n"
     ]
    }
   ],
   "source": [
    "misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep=\"\\t\")\n",
    "misexp_dup_feat_df = misexp_vrnt_feat_df[misexp_vrnt_feat_df.SVTYPE == \"DUP\"]\n",
    "misexp_gene_cols = {\"gene_id\": \"misexp_gene_id\", \"gene_start\": \"misexp_gene_start\", \n",
    "                    \"gene_end\": \"misexp_gene_end\", \"gene_strand\": \"misexp_gene_strand\"}\n",
    "misexp_dup_feat_df = misexp_dup_feat_df.rename(columns=misexp_gene_cols)\n",
    "\n",
    "misexp_dups = misexp_dup_feat_df.vrnt_id.unique()\n",
    "print(f\"Number of misexpression-associated duplications: {len(misexp_dups)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e0f0f157",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of DUP overlapping the gene 3' end: 5\n"
     ]
    }
   ],
   "source": [
    "# Overlap 3' end of gene \n",
    "dup_overlap_3prime_gene_df = misexp_dup_feat_df[misexp_dup_feat_df.position == \"Partial overlap 3' end\"]\n",
    "dup_overlap_3prime_gene_vrnt_id = dup_overlap_3prime_gene_df.vrnt_id.unique()\n",
    "print(f\"Number of DUP overlapping the gene 3' end: {len(dup_overlap_3prime_gene_vrnt_id)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "528bc07b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# subset to required variant-level features \n",
    "vrnt_features_cols = [\"vrnt_id\", \"misexp_gene_id\", \"chrom\", \"sv_start\", \"sv_end\", \"misexp_gene_start\", \"misexp_gene_end\", \"misexp_gene_strand\"]\n",
    "dup_overlap_3prime_gene_trunc_df = dup_overlap_3prime_gene_df[vrnt_features_cols].drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8fd8c70f",
   "metadata": {},
   "source": [
    "**Select DUPs that overlap gene 5'**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6ba37d8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load variants in tested windows \n",
    "vrnt_id_in_window_chr_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed\")\n",
    "vrnt_id_in_window_chr_bed = BedTool(str(vrnt_id_in_window_chr_path))\n",
    "gencode_bed = BedTool(str(gencode_bed_path))\n",
    "# intersect SVs and genes \n",
    "sv_overlap_gene_str = StringIO(str(vrnt_id_in_window_chr_bed.intersect(gencode_bed, wo=True)))\n",
    "# read intersect between SVs and genes \n",
    "columns={0: \"sv_chrom\", 1: \"sv_start\", 2: \"sv_end\", 3:\"vrnt_id\", \n",
    "         4:\"gene_chrom\", 5:\"overlap_gene_start\", 6:\"overlap_gene_end\", 7:\"overlap_gene_id\", \n",
    "         8:\"score\", 9:\"overlap_gene_strand\", 10:\"gene_overlap\"\n",
    "        }\n",
    "sv_overlap_gene_df = pd.read_csv(sv_overlap_gene_str, sep=\"\\t\", header=None, dtype={3:str}).rename(columns=columns)\n",
    "# subset to relevant features \n",
    "sv_overlap_gene_feat_df = sv_overlap_gene_df[[\"vrnt_id\", \"overlap_gene_start\", \"overlap_gene_end\", \"overlap_gene_id\", \"overlap_gene_strand\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "cace0b4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def overlaps_gene_5_prime(row): \n",
    "    if row[\"overlap_gene_strand\"] == \"+\": \n",
    "        return (row[\"sv_start\"] < row[\"overlap_gene_start\"]) & (row[\"sv_end\"] > row[\"overlap_gene_start\"]) & (row[\"sv_end\"] < row[\"overlap_gene_end\"])\n",
    "    elif row[\"overlap_gene_strand\"] == \"-\": \n",
    "        return (row[\"sv_start\"] > row[\"overlap_gene_start\"]) & (row[\"overlap_gene_end\"] > row[\"sv_start\"]) & (row[\"overlap_gene_end\"] < row[\"sv_end\"])\n",
    "    else: \n",
    "        return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "a6eacd4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add overlapping genes to misexpression DELs that are upstream \n",
    "misexp_dup_overlap_genes_df = pd.merge(dup_overlap_3prime_gene_trunc_df, \n",
    "                                       sv_overlap_gene_feat_df, \n",
    "                                       on=\"vrnt_id\", \n",
    "                                       how=\"left\")\n",
    "# annotate genes that overlap the 3' end \n",
    "misexp_dup_overlap_genes_df[\"overlap_gene_5_prime\"] = misexp_dup_overlap_genes_df.apply(overlaps_gene_5_prime, axis=1)\n",
    "# variant overlaps gene that is expressed in blood \n",
    "misexp_dup_overlap_genes_df[\"overlap_gene_expressed\"] = np.where(misexp_dup_overlap_genes_df.overlap_gene_id.isin(genes_median_tpm05), True, False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "4f0fe422",
   "metadata": {},
   "outputs": [],
   "source": [
    "misexp_dup_candidates_df = misexp_dup_overlap_genes_df[(misexp_dup_overlap_genes_df.overlap_gene_5_prime) &\n",
    "                                                      (misexp_dup_overlap_genes_df.overlap_gene_expressed) &\n",
    "                                                      (misexp_dup_overlap_genes_df.misexp_gene_strand == misexp_dup_overlap_genes_df.overlap_gene_strand)\n",
    "                                                     ].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "75c59b8a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of DUP fusion candidates: 4\n",
      "Tx fusion candidate DUPs:\n",
      " - 422023\n",
      " - 397951\n",
      " - 401916\n",
      " - 402648\n"
     ]
    }
   ],
   "source": [
    "misexp_dup_fusion_candidates = misexp_dup_candidates_df.vrnt_id.unique()\n",
    "print(f\"Number of DUP fusion candidates: {len(misexp_dup_fusion_candidates)}\")\n",
    "print(f\"Tx fusion candidate DUPs:\")\n",
    "for vrnt_id in misexp_dup_fusion_candidates: \n",
    "    print(f\" - {vrnt_id}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5137f90",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
